Carregando bibliotecas

library(DT)
library(ROCR)
library(MASS)
library(caret)
library(dplyr)
library(stats)
library(plotly)
library(readxl)
library(caTools)
library(ggplot2)
library(corrplot)
library(varhandle)

Exercícios de Múltipla Escolha

Exercício Computacional - 1

set.seed(12)

# Read and filter the data: one-hot encode Species with varhandle::to.dummy
# and keep only the virginica indicator as a binary factor response named
# `especies` alongside the four measurement columns.
df <- iris
dummies <- as.data.frame(to.dummy(iris$Species, "species"))
df <- cbind(
  df[, 1:4],
  especies = as.factor(dummies$species.virginica)
)
  
# Train/test split - 90/10, stratified on the response.
# Fixes: use `<-` for assignment (not `=`), and index directly with the
# logical vector instead of the redundant `divisao_df == TRUE` comparison.
divisao_df <- sample.split(df$especies, SplitRatio = 0.90)
df_treino <- subset(df, divisao_df)
df_teste <- subset(df, !divisao_df)

# Scatter plots of the training and test sets (interactive via plotly).
# A small helper avoids writing the same ggplot recipe twice.
plot_dispersao <- function(dados, titulo) {
  g <- ggplot(dados, aes(x = Petal.Length, y = Petal.Width, color = especies)) +
    geom_point() +
    ggtitle(titulo)
  ggplotly(g)
}
plot_dispersao(df_treino, "DF Treino")
plot_dispersao(df_teste, "DF Teste")
# Model construction: logistic regression of the virginica indicator on all
# four measurements. A formula object (especies ~ .) is preferred over the
# string "especies ~ ." — glm() coerces strings, but a real formula keeps
# the stored call self-describing. Fitted coefficients are unchanged.
modelo <- glm(especies ~ ., data = df_treino, family = "binomial")
modelo
## 
## Call:  glm(formula = especies ~ ., family = "binomial", data = df_treino)
## 
## Coefficients:
##  (Intercept)  Sepal.Length   Sepal.Width  Petal.Length   Petal.Width  
##      -53.280        -1.182        -4.318         9.238        16.313  
## 
## Degrees of Freedom: 134 Total (i.e. Null);  130 Residual
## Null Deviance:       171.9 
## Residual Deviance: 10.94     AIC: 20.94
# Predictions: class probabilities on the test set, rounded to 0/1 labels
# (round() applies an implicit 0.5 threshold).
predicoes_teste <- round(as.numeric(predict(modelo, df_teste, type = "response")))
predicoes_teste
##  [1] 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1

# Two new observations classified individually with the same pipeline.
flor1 <- data.frame(Sepal.Length = 6.4, Sepal.Width = 2.8, Petal.Length = 4.6, Petal.Width = 1.8)
flor2 <- data.frame(Sepal.Length = 6.3, Sepal.Width = 2.5, Petal.Length = 4.1, Petal.Width = 1.7)
round(as.numeric(predict(modelo, flor1, type = "response")))
## [1] 0
round(as.numeric(predict(modelo, flor2, type = "response")))
## [1] 0

Exercício Computacional - 2

# Confusion matrix for the test-set predictions; "1" (virginica) is the
# positive class.
pred_fator <- as.factor(predicoes_teste)
ref_fator <- as.factor(df_teste$especies)
confusionMatrix(data = pred_fator, reference = ref_fator, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 10  1
##          1  0  4
##                                           
##                Accuracy : 0.9333          
##                  95% CI : (0.6805, 0.9983)
##     No Information Rate : 0.6667          
##     P-Value [Acc > NIR] : 0.01941         
##                                           
##                   Kappa : 0.8421          
##                                           
##  Mcnemar's Test P-Value : 1.00000         
##                                           
##             Sensitivity : 0.8000          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.9091          
##              Prevalence : 0.3333          
##          Detection Rate : 0.2667          
##    Detection Prevalence : 0.2667          
##       Balanced Accuracy : 0.9000          
##                                           
##        'Positive' Class : 1               
## 

Exercício Computacional - 3

set.seed(100)

# Read the credit dataset
df <- read.csv("../inputs/credit_dataset.csv")

# Normalization: z-score the continuous variables.
# Fixes: TRUE is spelled out instead of T (T/F are ordinary variables and
# can be reassigned), and the three duplicated scale() lines are collapsed
# into one loop over the numeric columns.
vars_numericas <- c("credit.duration.months", "age", "credit.amount")
for (v in vars_numericas) {
  df[[v]] <- scale(df[[v]], center = TRUE, scale = TRUE)
}

# Conversion of the categorical variables to factors
var_factors <- c('credit.rating', 'account.balance', 'previous.credit.payment.status',
                 'credit.purpose', 'savings', 'employment.duration', 'installment.rate',
                 'marital.status', 'guarantor', 'residence.duration', 'current.assets',
                 'other.credits', 'apartment.type', 'bank.credits', 'occupation',
                 'dependents', 'telephone', 'foreign.worker')

# lapply over the selected columns replaces the explicit index loop;
# assignment back via df[var_factors] preserves column order and names.
df[var_factors] <- lapply(df[var_factors], as.factor)


# Train/test split - 60/40, stratified on credit.rating.
# Fixes: use `<-` for assignment (not `=`), and index directly with the
# logical vector instead of the redundant `divisao_df == TRUE` comparison.
divisao_df <- sample.split(df$credit.rating, SplitRatio = 0.60)
df_treino <- subset(df, divisao_df)
df_teste <- subset(df, !divisao_df)


# Model creation: logistic regression of credit.rating on every predictor.
# A formula object (credit.rating ~ .) is preferred over the string
# "credit.rating ~ ." — glm() coerces strings, but a real formula keeps the
# stored call self-describing. Fitted coefficients are unchanged.
modelo <- glm(credit.rating ~ ., data = df_treino, family = "binomial")
summary(modelo)
## 
## Call:
## glm(formula = credit.rating ~ ., family = "binomial", data = df_treino)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4823  -0.6585   0.3899   0.7028   2.3138  
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                      1.03367    1.09715   0.942 0.346120    
## account.balance2                 0.42494    0.28978   1.466 0.142532    
## account.balance3                 1.68601    0.28461   5.924 3.14e-09 ***
## credit.duration.months          -0.18457    0.14413  -1.281 0.200327    
## previous.credit.payment.status2  0.83888    0.41288   2.032 0.042175 *  
## previous.credit.payment.status3  1.75151    0.44006   3.980 6.89e-05 ***
## credit.purpose2                 -1.10025    0.49904  -2.205 0.027472 *  
## credit.purpose3                 -1.33254    0.48031  -2.774 0.005532 ** 
## credit.purpose4                 -1.76621    0.47177  -3.744 0.000181 ***
## credit.amount                   -0.34239    0.16305  -2.100 0.035734 *  
## savings2                         0.73669    0.39983   1.843 0.065400 .  
## savings3                         1.29027    0.48161   2.679 0.007383 ** 
## savings4                         0.64672    0.32181   2.010 0.044471 *  
## employment.duration2             0.35496    0.31163   1.139 0.254682    
## employment.duration3             0.77129    0.38109   2.024 0.042978 *  
## employment.duration4             0.40428    0.36096   1.120 0.262712    
## installment.rate2                0.13630    0.39674   0.344 0.731193    
## installment.rate3               -0.60469    0.43670  -1.385 0.166150    
## installment.rate4               -0.38190    0.38414  -0.994 0.320138    
## marital.status3                  0.24796    0.26044   0.952 0.341054    
## marital.status4                  0.70870    0.41647   1.702 0.088816 .  
## guarantor2                       0.46403    0.35729   1.299 0.194029    
## residence.duration2             -0.37545    0.36971  -1.016 0.309857    
## residence.duration3             -0.56620    0.41439  -1.366 0.171826    
## residence.duration4             -0.13539    0.37974  -0.357 0.721433    
## current.assets2                 -0.23866    0.32961  -0.724 0.469016    
## current.assets3                 -0.35931    0.30423  -1.181 0.237589    
## current.assets4                 -0.50886    0.53965  -0.943 0.345709    
## age                              0.08214    0.13168   0.624 0.532774    
## other.credits2                   0.13490    0.29385   0.459 0.646182    
## apartment.type2                  0.67322    0.31034   2.169 0.030062 *  
## apartment.type3                  0.47768    0.60814   0.785 0.432179    
## bank.credits2                   -0.19551    0.30245  -0.646 0.517995    
## occupation2                     -1.31950    0.86268  -1.530 0.126134    
## occupation3                     -1.65837    0.83024  -1.997 0.045777 *  
## occupation4                     -1.48500    0.87965  -1.688 0.091378 .  
## dependents2                     -0.22212    0.33562  -0.662 0.508086    
## telephone2                       0.36689    0.26467   1.386 0.165685    
## foreign.worker2                  2.35080    0.90337   2.602 0.009262 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 733.04  on 599  degrees of freedom
## Residual deviance: 540.02  on 561  degrees of freedom
## AIC: 618.02
## 
## Number of Fisher Scoring iterations: 5
# Test-set prediction (probabilities rounded to a 0/1 factor, implicit 0.5
# threshold) and confusion matrix; "1" is the positive class.
probs_teste <- predict(modelo, df_teste, type = "response")
predi_teste <- as.factor(round(as.numeric(probs_teste)))

confusionMatrix(data = predi_teste,
                reference = as.factor(df_teste$credit.rating),
                positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  64  44
##          1  56 236
##                                           
##                Accuracy : 0.75            
##                  95% CI : (0.7046, 0.7917)
##     No Information Rate : 0.7             
##     P-Value [Acc > NIR] : 0.01553         
##                                           
##                   Kappa : 0.3873          
##                                           
##  Mcnemar's Test P-Value : 0.27133         
##                                           
##             Sensitivity : 0.8429          
##             Specificity : 0.5333          
##          Pos Pred Value : 0.8082          
##          Neg Pred Value : 0.5926          
##              Prevalence : 0.7000          
##          Detection Rate : 0.5900          
##    Detection Prevalence : 0.7300          
##       Balanced Accuracy : 0.6881          
##                                           
##        'Positive' Class : 1               
##